#
# HTMLBody.py
# JunkMatcher
#
# Created by Benjamin Han on 2/1/05.
# Copyright (c) 2005 Benjamin Han. All rights reserved.
#

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

#!/usr/bin/env python

import htmlentitydefs, urllib

from consts import *
from utilities import *
from GlobalObjects import *
from HTMLEncoding import *


# Text substituted in place of an attribute identified as a hidden URL.
# NOTE(review): empty in the recovered source -- the original value may have
# been lost in extraction; confirm against the original distribution.
_TAG_FOR_HIDDEN_URL = u''

# Matches an HTML character reference: named (&amp;), decimal (&#38;) or
# hex (&#x26;) form.  (The recovered source had lost the leading '&#?',
# which made the pattern uncompilable.)
_entityPat = re.compile(r'&#?\w+;')
# Matches one tag: group 1 = tag name (may begin with '/'), group 2 = the
# raw attribute string.
_tagPat = re.compile(r'<\s*([^>\s]+)([^>]*)>')
# Matches a vacuous element pair ('a<b ...></b>c') flanked by non-space:
# groups 1/5 = flanking chars, 2/4 = opening/closing names, 3 = attributes.
_vacuousTagPat = re.compile(r'(?s)(\S)<([^>/\s]+)([^>]*)>\s*</([^>/\s]+)[^>]*>(\S)')
# Matches 'name = value' (group 1 = name, group 2 = value) or a bare value
# (group 1 is then None).
_attrPat = re.compile(r'(?i)(?:([^"\'<>=\s]+)\s*=\s*["\']?\s*)?([^"\'<>\s]+)["\']?')

# Command-line arguments for the elinks text-mode browser used to render HTML.
_ELINKS_ARGS='-force-html -auto-submit 0 -dump 1 -no-home 1 -stdin 1 -dump-charset utf8'

_htmlEncodingExtractor = HTMLEncodingExtractor()
_htmlFormatter = HTMLFormatter()

# shortcut to the standard entity-name -> Unicode-codepoint table
name2codepoint = htmlentitydefs.name2codepoint


def _translateEntities (mo):
    """re.sub() callback: translate one HTML character reference (matched by
    _entityPat) into its Unicode character.  Malformed or unknown references
    are returned untouched.  (The recovered source had lost the '&#' in the
    startswith() test below, which made the named-entity branch unreachable.)"""
    en = mo.group(0)
    if en.startswith('&#'):
        # numeric character reference: &#ddd; or &#xhh;
        if en[2] == 'x':
            try:
                return unichr(int(en[3:-1], 16))
            except (ValueError, OverflowError):
                return en       # bad hex digits or codepoint out of range
        else:
            try:
                return unichr(int(en[2:-1]))
            except (ValueError, OverflowError):
                return en       # bad decimal digits or codepoint out of range
    else:
        # named entity: &name;
        c = name2codepoint.get(en[1:-1])
        if c is None: return en
        else: return unichr(c)


class _ReplaceHiddenURL (object):
    """re.sub() callable applied to a tag's attribute string: counts valid
    URLs in urlDict and replaces hidden ones, recording their spans."""
    # no per-instance __dict__ -> faster attribute access, less memory
    __slots__ = ('urlDict', 'spanList', 'base', 'attrs')

    def __init__ (self):
        self.urlDict = {}    # valid (non-hidden) URL -> occurrence count
        self.spanList = []   # (start, end) spans of hidden URLs in the original HTML

    def __call__ (self, mo):
        """Must be called after self.base and self.attrs (a sets.Set) is set!"""
        name = mo.group(1)
        value = mo.group(2).strip()
        found = httpPat.search(value)

        if not found:
            # no URL here: reassemble the attribute (or bare value) unchanged
            if name:
                return '%s="%s"' % (name, value)
            return value

        if self.attrs is not None and name is not None and name.lower() in self.attrs:
            # URL behind an allowed attribute name: count it (unquoted, lowercased)
            url = urllib.unquote(found.group(0)).lower()
            self.urlDict[url] = self.urlDict.get(url, 0) + 1
            return '%s="%s"' % (name, url)

        # a hidden URL: remember where it sat and substitute the marker
        self.spanList.append((mo.start(0) + self.base, mo.end(0) + self.base))
        return _TAG_FOR_HIDDEN_URL


class _ReplaceBadTag (object):
    """re.sub() callable applied to whole tags: keeps known-good tags
    (delegating attribute scrubbing to a _ReplaceHiddenURL), silently drops
    comments, and removes bad tags while recording their spans."""
    # no per-instance __dict__ -> faster attribute access, less memory
    __slots__ = ('replaceHiddenURL', 'spanList')

    def __init__ (self):
        self.replaceHiddenURL = _ReplaceHiddenURL()
        self.spanList = []   # (start, end) spans of bad tags

    def __call__ (self, mo):
        tag = mo.group(1).lower()
        base = mo.start(2)                      # offset for replaceHiddenURL.spanList
        self.replaceHiddenURL.base = base

        attributes = mo.group(2)
        if attributes is not None:
            attributes = attributes.lower()
            if not attributes.strip(): attributes = None

        if tag[0] == '/':
            # an end tag
            if tag[1:] not in globalObjects.htmlTags:
                # a bad tag
                self.spanList.append((mo.start(0), mo.end(0)))
                return ''
            if attributes is not None:
                # no need to unquote the http strings -- we only want to know
                # whether each attribute value is an URL
                for attrMO in _attrPat.finditer(attributes):
                    if httpPat.match(attrMO.group(2).strip()):
                        # yes the value is an URL
                        self.replaceHiddenURL.spanList.append((attrMO.start(2) + base,
                                                               attrMO.end(2) + base))
            return '<%s>' % tag

        if tag.startswith('!--'):
            # comments are removed but not counted as bad tags
            return ''

        # a starting tag
        attrs = globalObjects.htmlTags.get(tag, False)
        if attrs is False:
            # a bad tag
            self.spanList.append((mo.start(0), mo.end(0)))
            return ''

        if attributes is None:
            return '<%s>' % tag

        if attrs is None:
            # no URL is allowed in any attribute
            self.replaceHiddenURL.attrs = None
            attributes = _attrPat.sub(self.replaceHiddenURL, attributes)
        elif isinstance(attrs, sets.Set):
            # URL allowed only after certain attribute names; when attrs
            # is '*' (not a Set) we don't check for hidden URLs at all
            self.replaceHiddenURL.attrs = attrs
            attributes = _attrPat.sub(self.replaceHiddenURL, attributes)
        return '<%s %s>' % (tag, attributes)


- class _ReplaceVacuousTag (object):
- # improving performance by not having __dict__
- __slots__ = ('urlDict', 'spanList')
-
- def __init__ (self, urlDict):
- self.urlDict = urlDict
- self.spanList = []
- def __call__ (self, mo):
- if mo.group(2).lower() == mo.group(4).lower():
- # remove URL, if any, from urlDict
- httpMO = httpPat.search(mo.group(3))
- if httpMO:
- url = httpMO.group(0).lower()
- if self.urlDict.has_key(url):
- self.urlDict[url] -= 1
-
- self.spanList.append((mo.end(1), mo.start(5)))
- return ''.join((mo.group(1), mo.group(5)))
- else:
- return mo.group(0)
-
-
class HTMLBody (object):
    """
    An HTML message body
    --------------------
    I. the following are set by __init__()

       htmlSrc: the raw source, in Unicode
       encoding: the encoding of the HTML message (could be None)

       contentWithoutEntities: the content without entities, in Unicode
       contentWithoutBadTags: the content without bad tags/hidden URLs, in Unicode
       content: final content without vacuous tags, in Unicode

       urlDict: a dict of (URL, count) tuples; URLs are unquoted and count could be 0

       hiddenURLList: a list of (start, end) index tuples for identified hidden URLs
                      (w.r.t. contentWithoutEntities)
       badTagList: a list of (start, end) index tuples for identified bad tags
                   (w.r.t. contentWithoutEntities)
       vacuousTagList: a list of (start, end) index tuples for identified vacuous tags
                       (w.r.t. contentWithoutBadTags)

    II. others:

       rendering: set by setRendering() (via elinks).
    """
    # improving performance by not having __dict__
    __slots__ = ('htmlSrc', 'encoding',
                 'contentWithoutEntities', 'contentWithoutBadTags', 'content',
                 'urlDict', 'hiddenURLList', 'badTagList', 'vacuousTagList', 'rendering')

    def __init__ (self, htmlSrc, defaultEncoding = None):
        """htmlSrc must be raw data (no encoding applied yet);
        defaultEncoding is used when no charset can be extracted from it."""
        # get the encoding (charset in the meta tag)
        encoding = _htmlEncodingExtractor.extract(htmlSrc)
        if encoding:
            # correct possible misspellings of the charset name
            corrected = charsetMispellings.get(encoding)
            if corrected: encoding = corrected
            self.encoding = encoding
        else:
            self.encoding = defaultEncoding

        # self.encoding can fall back to the defaultEncoding,
        # but encoding is *extracted* from htmlSrc, which can be None

        htmlSrc = decodeText(htmlSrc, self.encoding)   # decode into Unicode
        self.htmlSrc = _htmlFormatter.format(htmlSrc,  # rewrite/insert charset
                                             encoding is None,
                                             _htmlEncodingExtractor.hasTagHTML,
                                             _htmlEncodingExtractor.hasTagHead)
        htmlSrc = _entityPat.sub(_translateEntities, htmlSrc).strip()  # rid of entities

        # At this point: both htmlSrc and self.htmlSrc are in Unicode; their differences:
        # 1. self.htmlSrc keeps all entities, but htmlSrc doesn't
        # 2. in the meta tag, self.htmlSrc always has charset=utf8, but htmlSrc keeps the
        #    original.
        # 3. self.htmlSrc will be fed to elinks (which can't deal with lots of charsets),
        #    and htmlSrc will be used to produce other content* attributes

        self.contentWithoutEntities = htmlSrc

        # cleaning bad tags and hidden URLs
        replaceBadTag = _ReplaceBadTag()
        self.contentWithoutBadTags = _tagPat.sub(replaceBadTag, htmlSrc).strip()
        self.hiddenURLList = replaceBadTag.replaceHiddenURL.spanList
        self.urlDict = replaceBadTag.replaceHiddenURL.urlDict
        self.badTagList = replaceBadTag.spanList

        # removing vacuous tags
        replaceVacuousTag = _ReplaceVacuousTag(self.urlDict)
        self.content = _vacuousTagPat.sub(replaceVacuousTag, self.contentWithoutBadTags).strip()
        self.vacuousTagList = replaceVacuousTag.spanList

    def setRendering (self):
        """Renders self.htmlSrc via elinks, caches the text in self.rendering
        and returns it; the rendering is computed at most once.  On failure
        self.rendering is set to a placeholder error message instead.
        (Fixed: previously returned the rendering only on the cached path,
        and leaked the elinks output pipe when reading/decoding failed.)"""
        if hasattr(self, 'rendering'): return self.rendering

        # invoke elinks, piping the utf8-tagged source through stdin
        try:
            elinksIn, elinksOut = os.popen2('"%s"elinks %s'%(BIN_PATH, _ELINKS_ARGS))
            try:
                elinksIn.write(encodeText(self.htmlSrc))
                elinksIn.close()
                self.rendering = decodeText(elinksOut.read(), 'utf8') # do NOT strip()!
            finally:
                elinksOut.close()   # always release the pipe
        except:
            # deliberately broad: any failure degrades to a placeholder rendering
            self.rendering = u'[executing elinks caused unknown errors]'
        return self.rendering


- if __name__ == '__main__':
- import sys
-
- if len(sys.argv) == 1:
- print 'Usage: ./HTMLBody.py <filename>'
- print ' * filename is the name of the file containing HTML raw source.'
- sys.exit(1)
-
- htmlBody = HTMLBody(open(sys.argv[1]).read())
- print encodeText(htmlBody.content)
-
- htmlBody.setRendering()
- print encodeText(htmlBody.rendering)
-